First we need to import the necessary modules.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from beeswarm import beeswarm
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind
from matplotlib.patches import Ellipse,Patch,Arrow
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png')
from IPython.display import HTML, display
import tabulate
from decimal import Decimal
import warnings
warnings.filterwarnings('ignore') #in order to suppress warnings output of t-test if one sample is 0
pd.set_option('display.max_rows', 1000)
The we read sample files and extract separate data containers for the 4 tissues in question. The data have been normalized.
sampleinfo=pd.read_csv('sample_info_consensus.csv',sep=',')
sampleinfo=sampleinfo.query('paper == "fromm" or paper == "schee" or paper == "selitsky" or paper == "neerincx"')
countmat=pd.read_csv('consensus.rpm_uniq_seqs_correct_names.csv',sep=',')
countmat.rename(columns = {'Unnamed: 0':'mirna'}, inplace = True)
liver_normal=countmat[[row[0] for row in sampleinfo.values if row[3]=='liver' and row[2]=='normal']]
liver_meta=countmat[[row[0] for row in sampleinfo.values if row[3]=='liver' and row[2]=='metastasis']]
crc_normal=countmat[[row[0] for row in sampleinfo.values if row[3]=='colorect' and row[2]=='normal' and row[4]!='schee']]
crc_tumor=countmat[[row[0] for row in sampleinfo.values if row[3]=='colorect' and row[2]=='tumor']]
liver_normal.index=liver_meta.index=crc_normal.index=crc_tumor.index=list(countmat['mirna'])
For each miRNA, certain values are calculated such as t-test relevance values between different tissue types, fold changes between average values and count of samples. This is later used to filter the data. Also, miRNAs that are known to be tissue-related are excluded.
mirnanamelist=[]
mirnadatalist=[]
tissuemirna=["Hsa-Mir-122_5p ","Hsa-Mir-126_5p ","Hsa-Mir-126_3p ","Hsa-Mir-144_5p ","Hsa-Mir-144_3p ","Hsa-Mir-486_5p ",
"Hsa-Mir-143_3p ","Hsa-Mir-145_5p ","Hsa-Mir-150_5p ","Hsa-Mir-142-P1_5p ","Hsa-Mir-223_3p "]
for i in range(len(countmat.values)):
s1,p1=ttest_ind(crc_normal.values[i],crc_tumor.values[i])
s2,p2=ttest_ind(crc_tumor.values[i],liver_meta.values[i])
s3,p3=ttest_ind(crc_normal.values[i],liver_normal.values[i])
s4,p4=ttest_ind(liver_meta.values[i],liver_normal.values[i])
fc_cm_cb=np.mean(crc_tumor.values[i])/np.mean(crc_normal.values[i])
if fc_cm_cb<1:fc_cm_cb=-1/(fc_cm_cb+0.00001)
fc_lm_cm=np.mean(liver_meta.values[i])/np.mean(crc_tumor.values[i])
if fc_lm_cm<1:fc_lm_cm=-1/(fc_lm_cm+0.00001)
fc_lb_cb=np.mean(liver_normal.values[i])/np.mean(crc_normal.values[i])
if fc_lb_cb<1:fc_lb_cb=-1/(fc_lb_cb+0.00001)
fc_lb_lm=np.mean(liver_normal.values[i])/np.mean(liver_meta.values[i])
if fc_lb_lm<1:fc_lb_lm=-1/(fc_lb_lm+0.00001)
n_cb=len(crc_normal.values[i])
n_cm=len(crc_tumor.values[i])
n_lb=len(liver_normal.values[i])
n_lm=len(liver_meta.values[i])
totcounts=np.mean(liver_normal.values[i])+np.mean(liver_meta.values[i])+np.mean(crc_normal.values[i])+np.mean(crc_tumor.values[i])
tm=False
if countmat['mirna'][i] in tissuemirna:tm=True
mirnadatalist.append([p1,p2,p3,p4,fc_cm_cb,fc_lm_cm,fc_lb_cb,fc_lb_lm,n_cb,n_cm,n_lb,n_lm,totcounts,tm])
mirnanamelist.append(countmat['mirna'][i])
mirnalist=pd.DataFrame(data=mirnadatalist,index=mirnanamelist,columns=[
'p_cm_cb',
'p_cm_lm',
'p_cb_lb',
'p_lm_lb',
'fc_cm_cb',
'fc_lm_cm',
'fc_lb_cb',
'fc_lb_lm',
'N_cb',
'N_cm',
'N_lb',
'N_lm',
'total_counts',
'tissue_mirna'])
This is what it looks like for one example miRNA.
mirnalist.T['Hsa-Mir-122_5p ']
Next we define the visualization function called "combplot". The individual experimental miRNA counts are drawn as vertical (liver) and horizontal (colon) lines. The area where they overlap is indicated by an ellipse with the means as center and the standard deviation as half axes. This is done both for benign and malign tissues.
The straight green line indicates the equal count where colon and liver counts are the same (FC=1). An arrow indicates the trend from benign to malign. If the ellipse is top left it means the expression for this miRNA is higher in the colon tissue than in the liver tissue while it is lower if the ellipse is in the bottom right.
Additionally, a classical box plot of the four tissues is shown and overlaid with a beeswarm plot that shows individual measurements.
One example plot is given.
def combplot(mirna):
#prepare figure
fig = plt.figure(figsize=(13,7))
fig.suptitle(mirna,fontsize=20)
#prepare ellipses subplot
ax = fig.add_axes([0.1,0.1,0.44,0.44*13/7])
plt.xlabel('Individual counts (liver)')
plt.ylabel('Individual counts (colon)')
plotlimitmax=max(liver_normal.T[mirna].max(),crc_normal.T[mirna].max(),liver_meta.T[mirna].max(),crc_tumor.T[mirna].max())
plotlimitmin=min(liver_normal.T[mirna].min(),crc_normal.T[mirna].min(),liver_meta.T[mirna].min(),crc_tumor.T[mirna].min())
ax.set_xlim(plotlimitmin, plotlimitmax)
ax.set_ylim(plotlimitmin, plotlimitmax)
#ax.set_aspect('equal')
#prepare ellipses
xn=liver_normal.T[mirna].mean()
yn=crc_normal.T[mirna].mean()
xerrn=2*liver_normal.T[mirna].std()
yerrn=2*crc_normal.T[mirna].std()
xc=liver_meta.T[mirna].mean()
yc=crc_tumor.T[mirna].mean()
xerrc=2*liver_meta.T[mirna].std()
yerrc=2*crc_tumor.T[mirna].std()
ne=Ellipse(xy=(xn,yn), width=xerrn, height=yerrn,color='blue',lw=2,fill=False,alpha=1,label='Benign (Median+St.d.)')
ce=Ellipse(xy=(xc,yc), width=xerrc, height=yerrc,color='red',lw=2,fill=False,alpha=1,label='Malign (Media+St.d.)')
ax.add_artist(ne)
ax.add_artist(ce)
ne.set_zorder(10000)
ce.set_zorder(10000)
#draw arrow between ellipses centers
ar=Arrow(xn,yn,xc-xn,yc-yn,width=plotlimitmax/40,zorder=10001,color='purple')
ax.add_artist(ar)
#draw green line of constant fold change
gfc1,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,1*plotlimitmax],'g',label="FC=1")
gfc2,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,2*plotlimitmax],'g--',label="FC=2")
gfc05,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,0.5*plotlimitmax],'g-.',label="FC=-2")
#draw vertical and horizontal lines for each individual measurement
lw,al=10,0.07
nleg=Patch(color='black',alpha=al*5,label='Benign (ind. meas.)')
cleg=Patch(color='yellow',alpha=al*5,label='Malign (ind. meas.)')
ax.legend(handles=[ne,ce,nleg,cleg,gfc1,gfc2,gfc05],fontsize='small')
for xln in liver_normal.T[mirna].values:
ax.plot([xln,xln],[min(crc_normal.T[mirna].values),max(crc_normal.T[mirna].values)],'k-',linewidth=lw,alpha=al)
for ycn in crc_normal.T[mirna].values:
ax.plot([min(liver_normal.T[mirna].values),max(liver_normal.T[mirna].values)],[ycn,ycn],'k-',linewidth=lw,alpha=al)
for xlc in liver_meta.T[mirna].values:
ax.plot([xlc,xlc],[min(crc_tumor.T[mirna].values),max(crc_tumor.T[mirna].values)],'y-',linewidth=lw,alpha=al)
for ycc in crc_tumor.T[mirna].values:
ax.plot([min(liver_meta.T[mirna].values),max(liver_meta.T[mirna].values)],[ycc,ycc],'y-',linewidth=lw,alpha=al)
#prepare subplot for boxplot and beeswarm plot
ax2=fig.add_axes([0.61, 0.3, 0.35, 0.615])
ax2.boxplot([crc_normal.T[mirna].values,
crc_tumor.T[mirna].values,
liver_meta.T[mirna].values,
liver_normal.T[mirna].values],showmeans=True,meanline=True,
labels=['Colon benign\n N='+str(int(mirnalist.T[mirna][8])),
'Colon malign\n N='+str(int(mirnalist.T[mirna][9])),
'Liver malign\n N='+str(int(mirnalist.T[mirna][11])),
'Liver benign\n N='+str(int(mirnalist.T[mirna][10]))])
def GetColor(x):
colors = []
for item in x.index:
coh=sampleinfo['paper'][sampleinfo['sample']==item].values[0]
if coh == 'fromm': colors.append('red')
if coh == 'schee':colors.append('blue')
if coh == 'neerincx':colors.append('green')
if coh == 'selitsky':colors.append('yellow')
colors=np.array(colors)
dat=x.values
inds = dat.argsort()
sorteddatcol = colors[inds]
colors=sorteddatcol.tolist()
return colors
colors = (GetColor(crc_normal.T[mirna]) + GetColor(crc_tumor.T[mirna]) + GetColor(liver_meta.T[mirna]) + GetColor(liver_normal.T[mirna]))
beeswarm([np.copy(crc_normal.T[mirna].values),
np.copy(crc_tumor.T[mirna].values),
np.copy(liver_meta.T[mirna].values),
np.copy(liver_normal.T[mirna].values)],
ax=ax2,s=10,alpha=0.4,col=colors,positions=[1,2,3,4])
red_patch = Patch(color='red', label='Fromm')
blue_patch =Patch(color='blue', label='Schee')
green_patch = Patch(color='green', label='Neerincx')
yellow_patch = Patch(color='yellow', label='Selitsky')
ax2.legend(handles=[red_patch,blue_patch,green_patch,yellow_patch],fontsize='small')
ax3=fig.add_axes([0.61, 0.1, 0.35, 0.14])
ax3.plot([1,1,2,2],[4.5,4,4,4.5],'k')
ax3.text(1.5,3,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][4]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][0])),ha='center')
ax3.plot([2,2,3,3],[2.5,2,2,2.5],'k')
ax3.text(2.5,1,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][5]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][1])),ha='center')
ax3.plot([1,1,4,4],[0.5,0,0,0.5],'k')
ax3.text(2.5,-1,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][6]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][2])),ha='center')
ax3.plot([3,3,4,4],[-1.5,-2,-2,-1.5],'k')
ax3.text(3.5,-3,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][7]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][3])),ha='center')
ax3.set_xlim(0.6, 4.4)
ax3.axis('off')
plt.show()
combplot('Hsa-Mir-214_3p ')
Next comes a plot style where only the colon reads are shown.
def combplotcol(mirna):
#prepare figure
fig = plt.figure(figsize=(13,7))
fig.suptitle(mirna,fontsize=20)
#prepare ellipses subplot
ax = fig.add_axes([0.1,0.1,0.44,0.44*13/7])
plt.xlabel('Individual counts (colon benign)')
plt.ylabel('Individual counts (colon malign)')
plotlimitmax=max(crc_normal.T[mirna].max(),crc_tumor.T[mirna].max())
plotlimitmin=min(crc_normal.T[mirna].min(),crc_tumor.T[mirna].min())
ax.set_xlim(plotlimitmin, plotlimitmax)
ax.set_ylim(plotlimitmin, plotlimitmax)
#ax.set_aspect('equal')
#prepare ellipses
yn=crc_tumor.T[mirna].mean()
xn=crc_normal.T[mirna].mean()
yerrn=2*crc_tumor.T[mirna].std()
xerrn=2*crc_normal.T[mirna].std()
ne=Ellipse(xy=(xn,yn), width=xerrn, height=yerrn,color='blue',lw=2,fill=False,alpha=1,label='Median+St.d.')
ax.add_artist(ne)
ne.set_zorder(10000)
#draw green line of constant fold change
gfc1,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,1*plotlimitmax],'g',label="FC=1")
gfc2,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,2*plotlimitmax],'g--',label="FC=2")
gfc05,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,0.5*plotlimitmax],'g-.',label="FC=-2")
#draw vertical and horizontal lines for each individual measurement
lw,al=10,0.07
nleg=Patch(color='black',alpha=al*5,label='ind. meas.')
ax.legend(handles=[ne,nleg,gfc1,gfc2,gfc05],fontsize='small')
for xln in crc_normal.T[mirna].values:
ax.plot([xln,xln],[min(crc_tumor.T[mirna].values),max(crc_tumor.T[mirna].values)],'k-',linewidth=lw,alpha=al)
for ycn in crc_tumor.T[mirna].values:
ax.plot([min(crc_normal.T[mirna].values),max(crc_normal.T[mirna].values)],[ycn,ycn],'k-',linewidth=lw,alpha=al)
#prepare subplot for boxplot and beeswarm plot
ax2=fig.add_axes([0.61, 0.3, 0.35, 0.615])
ax2.boxplot([crc_normal.T[mirna].values,
crc_tumor.T[mirna].values],showmeans=True,meanline=True,
labels=['Colon benign\n N='+str(int(mirnalist.T[mirna][7])),
'Colon malign\n N='+str(int(mirnalist.T[mirna][8]))])
def GetColor(x):
colors = []
for item in x.index:
coh=sampleinfo['paper'][sampleinfo['sample']==item].values[0]
if coh == 'fromm': colors.append('red')
if coh == 'schee':colors.append('blue')
if coh == 'neerincx':colors.append('green')
if coh == 'selitsky':colors.append('yellow')
colors=np.array(colors)
dat=x.values
inds = dat.argsort()
sorteddatcol = colors[inds]
colors=sorteddatcol.tolist()
return colors
colors = (GetColor(crc_normal.T[mirna]) + GetColor(crc_tumor.T[mirna]))
beeswarm([np.copy(crc_normal.T[mirna].values),
np.copy(crc_tumor.T[mirna].values)],
ax=ax2,s=10,alpha=0.4,col=colors,positions=[1,2])
red_patch = Patch(color='red', label='Fromm')
blue_patch =Patch(color='blue', label='Schee')
green_patch = Patch(color='green', label='Neerincx')
yellow_patch = Patch(color='yellow', label='Selitsky')
ax2.legend(handles=[red_patch,blue_patch,green_patch,yellow_patch],fontsize='small')
ax3=fig.add_axes([0.61, 0.14, 0.35, 0.1])
ax3.plot([1,1,2,2],[1.5,1,1,1.5],'k')
ax3.text(1.5,0.3,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][4]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][0])),ha='center')
ax3.set_xlim(0.8, 2.2)
ax3.set_ylim(-1,2)
ax3.axis('off')
plt.show()
combplotcol(mirnanamelist[4])
Now comes data filtering. The following list consists of miRNA where the total average count of all tissues is higher than 400, the fold change between colon malign and colon benign average values is higher than 2 (or lower than -2) and the t-test probability of these two sample distributions to be equal is smaller than 0.01.
query1='total_counts > 400 and abs(fc_cm_cb) > 2 and p_cm_cb < 0.01 and tissue_mirna==False'
mirnalist.query(query1)
Now all these are plotted:
for item in list(mirnalist.query(query1).index):
combplotcol(item)
Now we are interested in the miRNA where both ellipses are different sides of the FC=1 straight green line, i.e. where the liver malign-colon malign ratio and the liver benign-colon malign ratio are both larger than 1 or both smaller than -1. In order to exclude cases that center around FC=1, higher cutoffs of 1.5 and -1.5, respectively, have been chosen.
As an additional filter, first come the ones where the change of colon malign to liver malign follow the same trend as liver benign to liver malign.
fco=1.5 #cutoff
query2='total_counts > 400 and tissue_mirna==False and ((fc_lm_cm < %f and fc_lb_cb > 1) or (fc_lm_cm > %f and fc_lb_cb < -1) or (fc_lm_cm < -1 and fc_lb_cb > %f) or (fc_lm_cm > 1 and fc_lb_cb < %f)) and fc_lm_cm*fc_lb_lm>0' % (-fco,fco,fco,-fco)
mirnalist.query(query2)
Now all these are plotted:
for item in list(mirnalist.query(query2).index):
combplot(item)
Now come the ones where the change of colon malign to liver malign do not follow the same trend as liver benign to liver malign.
fco=1.5 #cutoff
query12='total_counts > 400 and tissue_mirna==False and ((fc_lm_cm < %f and fc_lb_cb > 1) or (fc_lm_cm > %f and fc_lb_cb < -1) or (fc_lm_cm < -1 and fc_lb_cb > %f) or (fc_lm_cm > 1 and fc_lb_cb < %f)) and fc_lm_cm*fc_lb_lm<0' % (-fco,fco,fco,-fco)
mirnalist.query(query12)
for item in list(mirnalist.query(query12).index):
combplot(item)
Now the ones where the ellipses are on the same side, i.e. the selected miRNA is expressed stronger in one specific tissue, regardless of malign or benign. Additionally, the relative change of liver expression should go in the opposite direction such that liver remnants alone cannot be the reason for the observed changes.
fco=1.5 #cutoff value
query3='total_counts > 400 and tissue_mirna==0 and ((fc_lm_cm > %f and fc_lb_cb > 1 and fc_lb_lm > 1) or (fc_lm_cm < %f and fc_lb_cb < -1 and fc_lb_lm < 1) or (fc_lm_cm > 1 and fc_lb_cb > %f and fc_lb_lm > 1) or (fc_lm_cm < -1 and fc_lb_cb < %f and fc_lb_lm < 1))' % (fco,-fco,fco,-fco)
mirnalist.query(query3)
for item in list(mirnalist.query(query3).index):
combplot(item)
Now come the ones, where the relative change in miRNA expression in liver tissue has the same trend and could be the reason for the observed shift in expression.
fco=1.5 #cutoff value
query4='total_counts > 400 and tissue_mirna==False and ((fc_lm_cm > %f and fc_lb_cb > 1 and fc_lb_lm < 1) or (fc_lm_cm < %f and fc_lb_cb < -1 and fc_lb_lm > 1) or (fc_lm_cm > 1 and fc_lb_cb > %f and fc_lb_lm < 1) or (fc_lm_cm < -1 and fc_lb_cb < %f and fc_lb_lm > 1))' % (fco,-fco,fco,-fco)
mirnalist.query(query4)
for item in list(mirnalist.query(query4).index):
combplot(item)
These are the tissue related miRNAs:
query5='tissue_mirna==True'
mirnalist.query(query5)
for item in list(mirnalist.query(query5).index):
combplot(item)